1. Over/Under Analysis
# ---- Load the odds history -------------------------------------------------
# Odds table: parse the `date` column with anytime().
oo <- as.data.table(
  readRDS("/Users/ilkerkurtulus/Documents/cse-master/ie582/data/hw1/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds")
)
oo[, date := anytime(date)]

# ---- Load the matches and derive the goal-based columns ---------------------
mm <- as.data.table(
  readRDS("/Users/ilkerkurtulus/Documents/cse-master/ie582/data/hw1/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds")
)
# Drop the columns that are not used in this analysis.
mm[, c("leagueId", "type") := NULL]
mm[, date := anytime(date)]

# Keep only matches that have a recorded score.
mm <- mm[!is.na(score)]

# `score` looks like "home:away"; split it and convert both parts to numbers.
mm[, c("home_goal", "away_goal") := lapply(
  tstrsplit(score, ":", fixed = TRUE),
  as.numeric
)]
mm[, total_goals := home_goal + away_goal]

# 1 when more than 2.5 goals were scored in total, 0 otherwise.
mm[, is_over := ifelse(total_goals > 2.5, 1, 0)]

head(mm)
## matchId home away score date
## 1: KjF6FiA6 tottenham manchester city 0:0 2010-08-14 15:45:00
## 2: ILVbJgQm aston villa west ham 3:0 2010-08-14 18:00:00
## 3: SGIEDVvJ wolves stoke city 2:1 2010-08-14 18:00:00
## 4: YwL5xFHJ bolton fulham 0:0 2010-08-14 18:00:00
## 5: lQJAEBPC wigan blackpool 0:4 2010-08-14 18:00:00
## 6: byRcHDuf sunderland birmingham 2:2 2010-08-14 18:00:00
## home_goal away_goal total_goals is_over
## 1: 0 0 0 0
## 2: 3 0 3 1
## 3: 2 1 3 1
## 4: 0 0 0 0
## 5: 0 4 4 1
## 6: 2 2 4 1
Different handicap values correspond to different events with different probabilities, so they should not be pooled together. That is why, for the “ah” bet type, I pick totalhandicap = 0, and for the “ou” bet type, I pick 2.5.
When choosing bookmakers, we must make sure that each one provides odds for the handicap values and bet types chosen above. To check this, let's print the number of odds per bookmaker and choose:
# Number of "ou" odds at the 2.5 line quoted by each bookmaker.
oo[betType == "ou" & totalhandicap == "2.5", .N, by = "bookmaker"]
## bookmaker N
## 1: 1xBet 10187
## 2: bet-at-home 11838
## 3: bet365 13809
## 4: Betclic 12011
## 5: BetVictor 14063
## 6: Betway 11041
## 7: bwin 11528
## 8: Expekt 9664
## 9: Paddy Power 12492
## 10: Unibet 10905
## 11: William Hill 5494
## 12: youwin 11280
## 13: Betsafe 14853
## 14: Betsson 15276
## 15: Sportingbet 10672
## 16: Tipico 10046
## 17: Pinnacle 14610
## 18: 10Bet 36443
## 19: 12BET 11148
## 20: 188BET 12306
## 21: ComeOn 11497
## 22: SBOBET 10418
## 23: Interwetten 7279
## 24: 888sport 7319
## 25: Betfair 6219
## 26: Betfair Exchange 14914
## bookmaker N
# Number of "ah" odds at handicap 0 quoted by each bookmaker.
oo[betType == "ah" & totalhandicap == "0", .N, by = "bookmaker"]
## bookmaker N
## 1: 1xBet 10584
## 2: bet365 14846
## 3: Interwetten 7285
## 4: Unibet 5283
## 5: BetVictor 6863
## 6: Paddy Power 4972
## 7: Pinnacle 9853
## 8: Betfair Exchange 12309
## 9: 10Bet 9758
## 10: 188BET 6847
## 11: ComeOn 5745
## 12: 12BET 5796
## 13: SBOBET 5568
## 14: Betsson 322
## 15: Betsafe 44
## 16: youwin 60
## 17: Expekt 30
So we can select five bookmakers: 1xBet, bet365, Betfair Exchange, Pinnacle and 10Bet.
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' PCA of one bookmaker's odds, coloured by the over/under outcome
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Prints the PCA summary, draws the cumulative
#' explained-variance curve and builds a 3D scatter of the first three
#' principal components.
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_1a <- function(n_bm) {
  bm_name <- bookmakers[n_bm]
  bm_odds <- oo[bookmaker == bm_name]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  other_rows <- bm_odds[!(betType == "ou" | betType == "ah")]
  wide_other <- dcast(
    other_rows,
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  ah_rows <- bm_odds[betType == "ah" & totalhandicap == "0"]
  wide_ah <- dcast(
    ah_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  ou_rows <- bm_odds[betType == "ou" & totalhandicap == "2.5"]
  wide_ou <- dcast(
    ou_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Attach the over/under label and store it as character.
  labelled <- na.omit(odds_wide[mm[, c("matchId", "is_over")], on = "matchId"])
  labelled[, is_over := as.character(is_over)]

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the PCA.
  feature_idx <- 3:9
  pca <- prcomp(
    labelled[, feature_idx, with = FALSE],
    center = TRUE,
    scale. = TRUE
  )
  print(summary(pca))

  # Cumulative share of variance explained by the leading components.
  component_var <- pca$sdev^2
  cum_share <- cumsum(component_var / sum(component_var))
  plot(
    cum_share,
    type = "l",
    xlab = "# of Principle Components",
    ylab = "Cumulative Explained Variance"
  )
  title(paste0("Cumulative Explained Variance Ratio of PCA for ", bm_name))

  # Project the matches onto the components and keep the first three.
  scores <- predict(pca, newdata = labelled[, feature_idx, with = FALSE])
  top3 <- data.table(scores[, 1:3])
  top3[, is_over := labelled$is_over]

  # 3D scatter of PC1, PC2 and PC3.
  plot_ly(
    top3,
    x = ~PC1, y = ~PC2, z = ~PC3,
    colors = c("#132B43", "#56B1F7"),
    color = ~is_over,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(
      title = paste(
        "Transformed Data with p = 3 PCA and is_over results",
        bm_name
      )
    )
}
func_1a(1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion 0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
## PC7
## Standard deviation 0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion 1.00000
func_1a(2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion 0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
## PC7
## Standard deviation 0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion 1.00000
The bet365 point cloud is very compact. It is hard to find patterns in the data by looking at the graph.
func_1a(3)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion 0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
## PC7
## Standard deviation 0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion 1.00000
The result for Betfair Exchange is again hard to predict. p = 4 is a good choice for this bookmaker, since the first four components explain about 95% of the variance.
func_1a(4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion 0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
## PC7
## Standard deviation 0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion 1.00000
The most interesting graph is Pinnacle’s PCA visualization. It is a 3D V-shape. If you look along the PC3 axis, the projections of the points onto that axis are very close to each other, which makes interpretation harder compared to the others. This interesting plot actually derives from the explained variance ratio: with the first 3 components we can explain about 98% of the variance of the data, which makes 3 a very good choice for the number of components.
func_1a(5)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion 0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
## PC7
## Standard deviation 0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion 1.00000
For 10Bet, the cumulative variance explained by the first 3 components (about 93%) is lower than for Pinnacle (about 98%). The distortion of the V-shape also confirms that.
Calculation of the Manhattan and Euclidean distances, and their 3D representations with MDS:
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' 3D MDS of Manhattan distances, coloured by the over/under outcome
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Manhattan distances between matches are embedded
#' in three dimensions with classical MDS (`cmdscale()`).
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_1bman <- function(n_bm) {
  bm_odds <- oo[bookmaker == bookmakers[n_bm]]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  wide_other <- dcast(
    bm_odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  wide_ah <- dcast(
    bm_odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  wide_ou <- dcast(
    bm_odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Attach the over/under label and store it as character.
  all_df <- na.omit(odds_wide[mm[, c("matchId", "is_over")], on = "matchId"])
  all_df[, is_over := as.character(is_over)]

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the distance matrix.
  dist_man <- dist(all_df[, 3:9], method = "manhattan")
  mds_man3 <- data.table(cmdscale(dist_man, eig = TRUE, k = 3)$points)
  mds_man3[, is_over := all_df$is_over]

  # `domain` is not an attribute of scatter3d traces (3D placement is set via
  # layout(scene = list(domain = ...))), so it is not passed to plot_ly().
  plot_ly(
    mds_man3,
    x = ~V1, y = ~V2, z = ~V3,
    color = ~is_over,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(
      title = paste(
        "MDS Manhattan and is_over results",
        bookmakers[n_bm],
        sep = " "
      )
    )
}
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' 3D MDS of Euclidean distances, coloured by the over/under outcome
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Euclidean distances between matches are embedded
#' in three dimensions with classical MDS (`cmdscale()`).
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_1beuc <- function(n_bm) {
  bm_odds <- oo[bookmaker == bookmakers[n_bm]]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  wide_other <- dcast(
    bm_odds[(betType != "ou") & (betType != "ah")],
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  wide_ah <- dcast(
    bm_odds[(betType == "ah") & (totalhandicap == "0")],
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  wide_ou <- dcast(
    bm_odds[(betType == "ou") & (totalhandicap == "2.5")],
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Attach the over/under label and store it as character.
  all_df <- na.omit(odds_wide[mm[, c("matchId", "is_over")], on = "matchId"])
  all_df[, is_over := as.character(is_over)]

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the distance matrix.
  # stats::dist() explicitly accepts the "euclidian" spelling.
  dist_euc <- dist(all_df[, 3:9], method = "euclidian")
  mds_euc3 <- data.table(cmdscale(dist_euc, eig = TRUE, k = 3)$points)
  mds_euc3[, is_over := all_df$is_over]

  # `domain` is not an attribute of scatter3d traces (3D placement is set via
  # layout(scene = list(domain = ...))), so it is not passed to plot_ly().
  plot_ly(
    mds_euc3,
    x = ~V1, y = ~V2, z = ~V3,
    color = ~is_over,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(
      title = paste(
        "MDS Euclidian and is_over results",
        bookmakers[n_bm],
        sep = " "
      )
    )
}
# Render the Euclidean and Manhattan MDS plots (over/under colouring) for each
# of the five entries of `bookmakers`, in order.
func_1beuc(1)
func_1bman(1)
func_1beuc(2)
func_1bman(2)
func_1beuc(3)
func_1bman(3)
func_1beuc(4)
func_1bman(4)
func_1beuc(5)
func_1bman(5)
- Part C
Feature engineering for this part:
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' PCA of one bookmaker's odds, coloured by the 1/x/2 match result
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Prints the PCA summary, draws the cumulative
#' explained-variance curve and builds a 3D scatter of the first three
#' principal components.
#'
#' Side effect: adds (or overwrites) the column `is_1x2` on the global `mm`
#' by reference.
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_2a <- function(n_bm) {
  bm_name <- bookmakers[n_bm]
  bm_odds <- oo[bookmaker == bm_name]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  other_rows <- bm_odds[!(betType == "ou" | betType == "ah")]
  wide_other <- dcast(
    other_rows,
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  ah_rows <- bm_odds[betType == "ah" & totalhandicap == "0"]
  wide_ah <- dcast(
    ah_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  ou_rows <- bm_odds[betType == "ou" & totalhandicap == "2.5"]
  wide_ou <- dcast(
    ou_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Match result label: "1" home win, "x" draw, "2" away win.
  # This updates the global `mm` in place.
  mm[, is_1x2 := ifelse(
    home_goal > away_goal,
    "1",
    ifelse(home_goal == away_goal, "x", "2")
  )]
  all_1x2 <- na.omit(odds_wide[mm[, c("matchId", "is_1x2")], on = "matchId"])

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the PCA.
  feature_idx <- 3:9
  pca <- prcomp(
    all_1x2[, feature_idx, with = FALSE],
    center = TRUE,
    scale. = TRUE
  )
  print(summary(pca))

  # Cumulative share of variance explained by the leading components.
  component_var <- pca$sdev^2
  cum_share <- cumsum(component_var / sum(component_var))
  plot(
    cum_share,
    type = "l",
    xlab = "# of Principle Components",
    ylab = "Cumulative Explained Variance"
  )
  title(paste0(
    "1x2 Results - Cumulative Explained Variance Ratio of PCA for ",
    bm_name
  ))

  # Project the matches onto the components and keep the first three.
  scores <- predict(pca, newdata = all_1x2[, feature_idx, with = FALSE])
  top3 <- data.table(scores[, 1:3])
  top3[, is_1x2 := all_1x2$is_1x2]

  # 3D scatter of PC1, PC2 and PC3.
  plot_ly(
    top3,
    x = ~PC1, y = ~PC2, z = ~PC3,
    color = ~is_1x2,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(
      title = paste(
        "Transformed Data with p = 3 PCA and is_1x2 results",
        bm_name
      )
    )
}
func_2a(1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8620 1.4220 1.1528 0.31994 0.25363 0.09678
## Proportion of Variance 0.4953 0.2889 0.1898 0.01462 0.00919 0.00134
## Cumulative Proportion 0.4953 0.7842 0.9740 0.98863 0.99782 0.99916
## PC7
## Standard deviation 0.07657
## Proportion of Variance 0.00084
## Cumulative Proportion 1.00000
func_2a(2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7096 1.5209 1.2876 0.21767 0.20100 0.12273
## Proportion of Variance 0.4175 0.3304 0.2369 0.00677 0.00577 0.00215
## Cumulative Proportion 0.4175 0.7480 0.9848 0.99161 0.99738 0.99953
## PC7
## Standard deviation 0.05739
## Proportion of Variance 0.00047
## Cumulative Proportion 1.00000
func_2a(3)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7306 1.3078 1.2174 0.65847 0.53327 0.27811
## Proportion of Variance 0.4279 0.2443 0.2117 0.06194 0.04063 0.01105
## Cumulative Proportion 0.4279 0.6722 0.8839 0.94584 0.98647 0.99752
## PC7
## Standard deviation 0.13175
## Proportion of Variance 0.00248
## Cumulative Proportion 1.00000
func_2a(4)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.8741 1.6091 0.8743 0.28487 0.16779 0.12440
## Proportion of Variance 0.5018 0.3699 0.1092 0.01159 0.00402 0.00221
## Cumulative Proportion 0.5018 0.8717 0.9809 0.99246 0.99648 0.99869
## PC7
## Standard deviation 0.09562
## Proportion of Variance 0.00131
## Cumulative Proportion 1.00000
func_2a(5)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.9234 1.5186 0.71947 0.61145 0.23300 0.17029
## Proportion of Variance 0.5285 0.3295 0.07395 0.05341 0.00776 0.00414
## Cumulative Proportion 0.5285 0.8579 0.93188 0.98529 0.99305 0.99719
## PC7
## Standard deviation 0.14018
## Proportion of Variance 0.00281
## Cumulative Proportion 1.00000
Calculation of the Manhattan and Euclidean distances, and their 3D representations with MDS:
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' 3D MDS of Euclidean distances, coloured by the 1/x/2 match result
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Euclidean distances between matches are embedded
#' in three dimensions with classical MDS (`cmdscale()`).
#'
#' Side effect: adds (or overwrites) the column `is_1x2` on the global `mm`
#' by reference.
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_2beuc <- function(n_bm) {
  bm_name <- bookmakers[n_bm]
  bm_odds <- oo[bookmaker == bm_name]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  other_rows <- bm_odds[!(betType == "ou" | betType == "ah")]
  wide_other <- dcast(
    other_rows,
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  ah_rows <- bm_odds[betType == "ah" & totalhandicap == "0"]
  wide_ah <- dcast(
    ah_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  ou_rows <- bm_odds[betType == "ou" & totalhandicap == "2.5"]
  wide_ou <- dcast(
    ou_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Match result label: "1" home win, "x" draw, "2" away win.
  # This updates the global `mm` in place.
  mm[, is_1x2 := ifelse(
    home_goal > away_goal,
    "1",
    ifelse(home_goal == away_goal, "x", "2")
  )]
  all_1x2 <- na.omit(odds_wide[mm[, c("matchId", "is_1x2")], on = "matchId"])
  all_1x2[, is_1x2 := as.character(is_1x2)]

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the distance matrix.
  feature_idx <- 3:9
  pairwise <- dist(all_1x2[, feature_idx, with = FALSE], method = "euclidian")
  embedding <- cmdscale(pairwise, eig = TRUE, k = 3)
  coords <- data.table(embedding$points)
  coords[, is_1x2 := all_1x2$is_1x2]

  plot_ly(
    coords,
    x = ~V1, y = ~V2, z = ~V3,
    color = ~is_1x2,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(title = paste("MDS Euclidian and is_1x2 results", bm_name))
}
# Bookmakers compared in this analysis; the functions below pick one of them
# by its position in this vector.
bookmakers <- c(
  "1xBet", "bet365", "Betfair Exchange", "Pinnacle", "10Bet"
)

#' 3D MDS of Manhattan distances, coloured by the 1/x/2 match result
#'
#' Reads the global tables `oo` (odds) and `mm` (matches) and the global
#' `bookmakers` vector. Odds are averaged per match with `dcast()`; "ah" odds
#' are restricted to `totalhandicap == "0"` and "ou" odds to
#' `totalhandicap == "2.5"`. Manhattan distances between matches are embedded
#' in three dimensions with classical MDS (`cmdscale()`).
#'
#' Side effect: adds (or overwrites) the column `is_1x2` on the global `mm`
#' by reference.
#'
#' @param n_bm Position in `bookmakers` of the bookmaker to analyse.
#' @return The plotly object produced by `plot_ly() %>% layout()`.
func_2bman <- function(n_bm) {
  bm_name <- bookmakers[n_bm]
  bm_odds <- oo[bookmaker == bm_name]

  # Every bet type except "ou" and "ah": one averaged column per
  # betType/oddtype combination.
  other_rows <- bm_odds[!(betType == "ou" | betType == "ah")]
  wide_other <- dcast(
    other_rows,
    matchId + bookmaker ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Only handicap 0 is kept for "ah": different handicaps are different
  # events, so their odds should not be mixed.
  ah_rows <- bm_odds[betType == "ah" & totalhandicap == "0"]
  wide_ah <- dcast(
    ah_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Same reasoning for "ou": only the 2.5 line is kept.
  ou_rows <- bm_odds[betType == "ou" & totalhandicap == "2.5"]
  wide_ou <- dcast(
    ou_rows,
    matchId ~ betType + oddtype,
    value.var = "odd",
    fun.aggregate = mean
  )

  # Join the three wide tables on matchId and drop incomplete matches.
  odds_wide <- na.omit(
    wide_other[wide_ah, on = "matchId"][wide_ou, on = "matchId"]
  )

  # Match result label: "1" home win, "x" draw, "2" away win.
  # This updates the global `mm` in place.
  mm[, is_1x2 := ifelse(
    home_goal > away_goal,
    "1",
    ifelse(home_goal == away_goal, "x", "2")
  )]
  all_1x2 <- na.omit(odds_wide[mm[, c("matchId", "is_1x2")], on = "matchId"])
  all_1x2[, is_1x2 := as.character(is_1x2)]

  # NOTE(review): features are selected by column position; confirm that
  # columns 3 to 9 are the odds columns intended for the distance matrix.
  feature_idx <- 3:9
  pairwise <- dist(all_1x2[, feature_idx, with = FALSE], method = "manhattan")
  embedding <- cmdscale(pairwise, eig = TRUE, k = 3)
  coords <- data.table(embedding$points)
  coords[, is_1x2 := all_1x2$is_1x2]

  plot_ly(
    coords,
    x = ~V1, y = ~V2, z = ~V3,
    color = ~is_1x2,
    type = "scatter3d",
    mode = "markers"
  ) %>%
    layout(title = paste("MDS Manhattan and is_1x2 results", bm_name))
}
# Render the Manhattan and Euclidean MDS plots (1/x/2 colouring) for each of
# the five entries of `bookmakers`, in order.
func_2bman(1)
func_2beuc(1)
func_2bman(2)
func_2beuc(2)
func_2bman(3)
func_2beuc(3)
func_2bman(4)
func_2beuc(4)
func_2bman(5)
func_2beuc(5)